IMPORTANT NOTE

This is transcript-level analysis. Importantly, it is also possible to use aggregation transcript p-values to perform gene differential expression, as introduced in Yi et al., 2017. In this framework, differential expression is performed on transcripts as usual, but then transcript-level p-values can be aggregated to obtain gene-level p values. The results are therefore different than the old method of gene_mode=T. Also see this walkthrough.

Data pre-processing (always the same)

The original FASTQ files (50 bp, single-end, reverse stranded) were processed with kallisto using a custom transcriptome, which is a combination of the REFSEQ mRNA sequences AND RepBase D. melanogaster TE seqeunces (refGene_plus_repbase)

./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL25 YL25_shW_RNAseq_1st.fq.gz

./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL26 YL26_shSmt3_RNAseq_1st.fq.gz

./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL27 YL27_shW_RNAseq_2nd.fq.gz

./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL28 YL28_shSmt3_RNAseq_2nd.fq.gz

Preparing sleuth object

s2c<-data.frame(sample=c("YL25","YL27","YL26","YL28"),condition=c("ctrl", "ctrl","KD","KD"), 
                path=c("../kallisto/YL25/","../kallisto/YL27/", "../kallisto/YL26/","../kallisto/YL28/"))
s2c$path<-as.character(s2c$path)
print(s2c)
##   sample condition              path
## 1   YL25      ctrl ../kallisto/YL25/
## 2   YL27      ctrl ../kallisto/YL27/
## 3   YL26        KD ../kallisto/YL26/
## 4   YL28        KD ../kallisto/YL28/
#this is to create a transcript to gene name dictionary - I just transform each fasta header.
t<-read.table("../kallisto/YL26/abundance.tsv", header=T, as.is=T)
t2g<-data.frame("target_id"=t$target_id, gene=sub(":CDS.*","",sub(".*gene=","", t$target_id)))
t2g$coding<-grepl("CDS", t2g$target_id)
t2g$TE<-grepl("\\|", t2g$target_id)

# number of elements of different type: with CDS annotation, RepBase entries (TE), and none (ncRNAs or genes with no CDS)
t2g %>% select(gene,coding, TE) %>% distinct() %>% group_by(coding, TE) %>% tally()
## # A tibble: 3 x 3
## # Groups:   coding [2]
##   coding TE        n
##   <lgl>  <lgl> <int>
## 1 FALSE  FALSE  2673
## 2 FALSE  TRUE    238
## 3 TRUE   FALSE 13939
# make "so", note that aggregation column is supplied, but gene_mode is FALSE
so <- sleuth_prep(s2c, ~condition, extra_bootstrap_summary=T, read_bootstrap_tpm = TRUE, aggregation_column = 'gene', target_mapping = t2g, gene_mode=FALSE) 
## reading in kallisto results
## dropping unused factor levels
## ....
## normalizing est_counts
## 18141 targets passed the filter
## normalizing tpm
## merging in metadata
## summarizing bootstraps
## ....


Input values

Here, we can inspect the input data table. This is in transcripts as aggregation is for the p-values.

units
## [1] "est_counts"
table1<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>% 
  rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) 
table1 %>% round(digits=2) %>% datatable(options=list(pageLength=5))
## Warning in instance$preRenderHook(instance): It seems your data is too big for client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

Analysеs

Running sleuth Likelihood Ratio test (LRT), and displaying a table of significantly changed genes/tx, qval<0.05. This table DOES NOT have information about up- or down- regulation.

so <- sleuth_fit(so, ~condition,'full')
so <- sleuth_fit(so, ~1, 'reduced')
so <- sleuth_lrt(so, 'reduced', 'full')

sleuth_results_LRT_gene <- sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE)
sleuth_results_LRT_tx <-sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE, pval_aggregate = F)

#gene aggregated table
dplyr::filter(sleuth_results_LRT_gene, qval <= 0.05) %>% datatable(options = list(pageLength=5))
#tx aggregated table
dplyr::filter(sleuth_results_LRT_tx, qval <= 0.05) %>% datatable(options = list(pageLength=5))


Running wald test & plotting MA plot. Note that there are beta-vals only for the tx-based table where the diff.expr was performed, therefore, the MA-plot is transcript-based.

Note: the displayed table shows all results, however the MA plot does not show txs which did not pass filters, and Inf values.

so <- sleuth_wt(so, 'conditionKD')
sleuth_results_wald_gene <- sleuth_results(so, 'conditionKD', show_all = TRUE)
sleuth_results_wald_tx<- sleuth_results(so, 'conditionKD', show_all = TRUE, pval_aggregate = F)

#gene aggregated table
sleuth_results_wald_gene %>% datatable(options = list(pageLength=5))
#tx aggregated table
sleuth_results_wald_tx %>% datatable(options = list(pageLength=5))
# MA plot, transcript analysis. Adding count data for later.
table.counts<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))

res<-sleuth_results(so, 'conditionKD', show_all = FALSE, pval_aggregate = F) %>% left_join(table.counts, by="target_id") %>% mutate(signFC=(qval<0.05 & abs(log2FC)>1), significant=qval<0.05) %>% arrange(qval)

p<-ggplot(res, aes(mean_obs, b)) + theme_classic() + ggtitle("MA plot of transcripts and TEs, sleuth")
p <- p + xlab(paste("mean( log(counts+ 0.5 ) )"))
p <- p + ylab(paste0("beta: conditionKD")) 
p <- p + geom_text(data=(sleuth_results_wald_tx %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(mean_obs, b, label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=0.5))

p1 <- p + geom_point(aes(colour = significant), alpha = 0.2) + scale_colour_manual(values = c("black", "green4"))
p1

Fig. MA plot from sleuth analysis, Wald test, transcript-based. All transcripts with qvalue<0.05 are highlighted (warning: beta != log2FC).

Below it is the same MA plot, but I only marked genes with >2 fold change, calculated from the original values in the sleuth table above.

p2 <- p + geom_point(aes(colour = signFC), alpha = 0.2) + scale_colour_manual(values = c("black", "red"))
p2

Fig. MA plot from sleuth analysis, transcript-level, Wald test. (warning: beta != log2FC). Only genes with qval<0.05 & log2FC (from kallisto numbers) are highlighted.



Alternative visualization

This is an alternative visualization of the data, where we do a scatter plot in control vs knockdown. We can choose to select the dots based on qvalues either from the Wald test, or the LTR test. The test choice does not change the interpretation of the data in the light of the R1, R2 up-regulation compared to genes.

#values tables
table.tpm<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = "tpm") %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))

tab.raw<-full_join(table.counts, table.tpm, by="target_id",suffix=c(".count",".tpm")) 

### stats tables
tab.wald<-sleuth_results(so, 'conditionKD', show_all = TRUE, pval_aggregate = F) %>% arrange(target_id) # wald test
tab.lrt<-sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE, pval_aggregate = F) %>% arrange(target_id) #lrt test
tab.stat<-full_join(tab.wald, tab.lrt, by="target_id", suffix=c(".wald",".lrt")) 

### all data together, fold change is taken from the est_counts 
tab.plot<-full_join(tab.stat, tab.raw, by="target_id") %>% mutate(signWald=(abs(log2FC.count)>1 & qval.wald<0.05), signLRT=(abs(log2FC.count)>1 & qval.lrt<0.05))

### plot
p.sub<-geom_text(data=(tab.plot %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(log2(x.tpm+1), log2(y.tpm+1), label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=1))

top20.x<-tail(sort(tab.plot$x.tpm),20)[1]
top20.y<-tail(sort(tab.plot$y.tpm),20)[1]

p<-ggplot(tab.plot, aes(log2(x.tpm+1), log2(y.tpm+1))) + theme_classic() + theme(legend.position="left") + p.sub
p<-p + scale_color_manual(values=c("black", "beige", "green4")) 
p<-p + xlab(paste("shW, log2 (av.tpm + 1) ")) + ylab(paste("shSmt3, log2 (av.tpm + 1) "))
p<-p + geom_hline(yintercept = log2(top20.y+1), linetype="dotted") + geom_vline(xintercept = log2(top20.x+1), linetype="dotted") 

p1<-p + geom_point(aes(colour = paste(signWald)), alpha = 0.4) + geom_abline(intercept = 0, slope=1, col="blue")  
p2<-p + geom_point(aes(colour = paste(signLRT)), alpha = 0.4)  + geom_abline(intercept = 0, slope=1, col="blue")  

p3<-ggExtra::ggMarginal(p1, type="histogram", size=3, fill = "grey70")
p4<-ggExtra::ggMarginal(p2, type="histogram", size=3, fill = "grey70")

gridExtra::grid.arrange(p3, p4, nrow=1)

Figure. Scatter plots of transcript expression in control vs KD. The dotted line denotes area that contains the top 20 most highly expressed genes in the corresponding condition. Values log2FC>2 and significant: qvalue (Wald or LRT) < 0.05 are highlighted. The histograms above the y-axis and x-axis show the distribution of values. Blue line shows 1:1 ratio.

Note: Here, pale dots show values (“NA”) which did not pass various sleuth filters and should be ignored. Can be removed by “show_all=FALSE”.



Numerical summary, transcripts

First we prepare the data in a convenient table of all.

tab.all<-tab.plot %>% mutate("filt_out"=is.na(b), "type"=as.factor(ifelse(str_detect(target_id, "\\|"),"TE","RefSeq")))

Number of RepBase (TE) and other (protein-coding genes,ncRNA, etc) entries from RefSeq (all genes) analyzed; some were filtered out because they didn’t pass sleuth criteria (usually very low read counts or large discrepancy between reps).

tab.all %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out)
## # A tibble: 4 x 3
## # Groups:   type [2]
##   type   filt_out     n
##   <fct>  <lgl>    <int>
## 1 RefSeq FALSE    17975
## 2 TE     FALSE      166
## 3 RefSeq TRUE     15330
## 4 TE     TRUE        72

Number of significantly changed transcripts (qval 0.05, LRT or Walt Test; logFC (onscaled_reads_per_base)):

#Wald test
tab.all %>% filter(abs(log2FC.count)>1, qval.wald<0.05) %>% mutate("change"=ifelse(log2FC.count>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups:   type [2]
##   type   change     n
##   <fct>  <chr>  <int>
## 1 RefSeq up       187
## 2 TE     up        36
## 3 RefSeq down     159
## 4 TE     down       2
#LRT
tab.all %>% filter(abs(log2FC.count)>1, qval.lrt<0.05) %>% mutate("change"=ifelse(log2FC.count>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups:   type [2]
##   type   change     n
##   <fct>  <chr>  <int>
## 1 RefSeq up        56
## 2 TE     up        23
## 3 RefSeq down      64
## 4 TE     down       2

Summary of the numbers above:

From 17975 RefSeq transcripts and 166 RebBase entries which passed the initial filters, 187(56) and 36(23) are significantly up-regulated more than 2-fold in smt3 KD, and 159(64) or 2 are down-regulated more than 2-fold (qvalue 0.05, Wald test or LTR test).

Therefore, smt3 KD led to >2 fold up/down -regulation for ~1% of the transcripts (protein-coding and ncRNAs), and up-regulation of ~20% of the TEs.

R1 and R2 - from relatively lowly expressed genes in the control - become among the top 20 most abundant products in SUMO KD. See below:


Although many txs change, it is evident from the plots mostly modestly expressed ones show large FC, except for R1 and R2. If we filter txs with over 100 times change in expression log2FC>=~6.64, significant in at least one test:

tab.all %>% filter(log2FC.count>log2(100)) %>% filter(qval.wald<0.05 | qval.lrt<0.05) %>%
                    select(target_id, shW.1.count, shW.2.count, shSmt3.1.count, shSmt3.2.count, log2FC.count, type, b, qval.wald, qval.lrt) %>% mutate_at(2:5, funs(round(.,3))) %>% arrange(-( shSmt3.1.count*0.5+shSmt3.2.count*0.5)) %>% datatable(options=list(pageLength=5))

Results show that all other genes, apart from R1 and R2 in this set, are ones that practically have no reads in the control, and just few reads in the KD.


Numbers below show the expression of R1 and R2 elements; including rank of the average values. “X” and “Y” are average per control and KD, respectively:

*Note: Values can be seen as transcripts per million (TPM), see this explanation of TPM , or just counts.

tab.all.ranks1<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*count"),matches("(x|y).count")) %>% mutate(ctrl.rank.count=rank(desc(x.count)), kd.rank.count=rank(desc(y.count)))

tab.all.ranks2<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*tpm"),matches("(x|y).tpm")) %>% mutate(ctrl.rank.tpm=rank(desc(x.tpm)), kd.rank.tpm=rank(desc(y.tpm)))

tab.all.ranks1 %>% filter(str_detect(target_id, "^R[12]_"))
##                          target_id shW.1.count shSmt3.1.count shW.2.count shSmt3.2.count  x.count  y.count ctrl.rank.count kd.rank.count
## 1 R1_DM|Drosophila_fruit_fly_genus    239.2480       348313.7    243.4254       201125.2 241.3367 274719.5            9076             1
## 2 R2_DM|Drosophila_fruit_fly_genus    629.5107       194856.4    621.1112       197785.8 625.3109 196321.1            5702             4
tab.all.ranks2 %>% filter(str_detect(target_id, "^R[12]_"))
##                          target_id shW.1.tpm shSmt3.1.tpm shW.2.tpm shSmt3.2.tpm     x.tpm    y.tpm ctrl.rank.tpm kd.rank.tpm
## 1 R1_DM|Drosophila_fruit_fly_genus  2.740584     4013.985  2.790111     2315.011  2.765347 3164.498         11553          18
## 2 R2_DM|Drosophila_fruit_fly_genus 10.911782     3397.955 10.772657     3444.922 10.842219 3421.439          7863          16

Notably, R1 and R2 change from relatively lowly expressed elements, to the top 20 most abundant transcripts (in TPM). No other elements shows such a dramatic increase.



Gene-based analysis with p-value aggregation

Numbers of genes and TEs that pass the filters (same as in the gene-mode analysis)

tab.gene.stat<-full_join(sleuth_results_LRT_gene, sleuth_results_wald_gene, by="target_id", suffix=c(".lrt",".wald")) %>% mutate("filt_out"=is.na(qval.lrt),  "type"=as.factor(ifelse(str_detect(target_id, "\\|"),"TE","RefSeq")))

tab.gene.stat %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
##     type filt_out    n
## 1 RefSeq    FALSE 8996
## 2     TE    FALSE  166
## 3 RefSeq     TRUE 7616
## 4     TE     TRUE   72

Number of significantly changed elements, either direction:

#LRT
tab.gene.stat %>% filter(qval.lrt<0.05) %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
##     type filt_out   n
## 1 RefSeq    FALSE 773
## 2     TE    FALSE  51
#Wald test
tab.gene.stat %>% filter(qval.wald<0.05) %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out) %>% as.data.frame()
##     type filt_out    n
## 1 RefSeq    FALSE 1280
## 2     TE    FALSE   59

To introduce a fold-change cutoff here, we can use the original transcript counts. Note that each transcript has different fold change. Here, I extract genes based on aggregated qvals < 0.05, and for which at leat 1 transcript has >2-fold change.

table.counts<-left_join(table.counts, t2g, by="target_id")

#LRT 
sign<-tab.gene.stat %>% filter(qval.lrt<0.05)
merged<-inner_join(table.counts, sign, by=c("gene"="target_id"))
merged %>% filter(abs(log2FC)>1) %>% mutate("change"=ifelse(log2FC>0,"up","down")) %>% select(gene, type, change)  %>% distinct() %>% group_by(type, change) %>% tally() %>% arrange(change) %>% data_frame()
## # A tibble: 4 x 3
##   type   change     n
##   <fct>  <chr>  <int>
## 1 RefSeq down     221
## 2 TE     down       2
## 3 RefSeq up       209
## 4 TE     up        35
#Wald test 
sign<-tab.gene.stat %>% filter(qval.wald<0.05)
merged<-inner_join(table.counts, sign, by=c("gene"="target_id"))
merged %>% filter(abs(log2FC)>1) %>% mutate("change"=ifelse(log2FC>0,"up","down")) %>% select(gene, type, change)  %>% distinct() %>% group_by(type, change) %>% tally() %>% arrange(change) %>% data_frame()
## # A tibble: 4 x 3
##   type   change     n
##   <fct>  <chr>  <int>
## 1 RefSeq down     333
## 2 TE     down       3
## 3 RefSeq up       345
## 4 TE     up        37